import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
le=preprocessing.LabelEncoder()


df = pd.read_csv('Gradesv2.csv')


df.head()


df.columns #Viewing Columns

Index(['Subject Yr Student ID', 'SGV', 'Assignment \nTotal: 100',
       'Test Total: 100', 'Individual Presentation \nTotal: 100',
       'Grp Project\nTotal: 100', 'Subject Mark', 'Subject Grade',
       'Subject Grade Point', 'Gender', 'Age', 'Institute', 'Programme',
       'Admission GPA', 'Unnamed: 14'],
      dtype='object')


df.info() #Viewing information about each column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Subject Yr Student ID                131 non-null    object 
 1   SGV                                  131 non-null    int64  
 2   Assignment 
Total: 100               131 non-null    int64  
 3   Test Total: 100                      131 non-null    float64
 4   Individual Presentation 
Total: 100  131 non-null    float64
 5   Grp Project
Total: 100               131 non-null    int64  
 6   Subject Mark                         131 non-null    float64
 7   Subject Grade                        131 non-null    object 
 8   Subject Grade Point                  131 non-null    float64
 9   Gender                               131 non-null    object 
 10  Age                                  129 non-null    float64
 11  Institute                            128 non-null    object 
 12  Programme                            128 non-null    object 
 13  Admission GPA                        128 non-null    object 
 14  Unnamed: 14                          3 non-null      object 
dtypes: float64(5), int64(3), object(7)
memory usage: 15.5+ KB


percent_missing = df.isnull().sum() * 100 / len(df) 
percent_missing = round(percent_missing, 2)
percent_missing = percent_missing.astype(str) + '%'
percent_missing

Subject Yr Student ID                     0.0%
SGV                                       0.0%
Assignment \nTotal: 100                   0.0%
Test Total: 100                           0.0%
Individual Presentation \nTotal: 100      0.0%
Grp Project\nTotal: 100                   0.0%
Subject Mark                              0.0%
Subject Grade                             0.0%
Subject Grade Point                       0.0%
Gender                                    0.0%
Age                                      1.53%
Institute                                2.29%
Programme                                2.29%
Admission GPA                            2.29%
Unnamed: 14                             97.71%
dtype: object


dup_percentage = df.duplicated().sum()/len(df)*100
dup_percentage = round(dup_percentage,2)
dup_percentage = dup_percentage.astype(str) + '%'
dup_percentage

'0.0%'


# Dropping row with high missing values
df = df.drop('Unnamed: 14', 1)


# Renaming Columns to be more meaningful
df.rename(columns = {'Subject Yr Student ID':'ID','Test Total: 100':'Test'}, inplace = True)
df.rename(columns = {'Assignment \nTotal: 100':'Assignment'}, inplace = True)
df.rename(columns = {'Individual Presentation \nTotal: 100':'Individual Presentation'}, inplace = True)
df.rename(columns = {'Grp Project\nTotal: 100':'Grp Project'}, inplace = True)


# Arranging Columns
cols = ['Subject Yr Student ID','SGV', 'Assignment', 'Test', 'Individual Presentation', 'Grp Project', 
        'Subject Mark', 'Subject Grade', 'Subject Grade Point', 'Gender', 'Age', 'Institute', 'Programme', 
        'Admission GPA']


# simply drop whole row with NaN in "Age" column
df.dropna(subset=["Institute","Programme"], axis=0, inplace=True)

# reset index, because we droped 
df.reset_index(drop=True, inplace=True)


# Checking for missing values after cleaning
percent_missing = df.isnull().sum() * 100 / len(df) 
percent_missing = round(percent_missing, 2)
percent_missing = percent_missing.astype(str) + '%'
percent_missing

ID                         0.0%
SGV                        0.0%
Assignment                 0.0%
Test                       0.0%
Individual Presentation    0.0%
Grp Project                0.0%
Subject Mark               0.0%
Subject Grade              0.0%
Subject Grade Point        0.0%
Gender                     0.0%
Age                        0.0%
Institute                  0.0%
Programme                  0.0%
Admission GPA              0.0%
dtype: object


df.dtypes # Checking Datatypes

ID                          object
SGV                          int64
Assignment                   int64
Test                       float64
Individual Presentation    float64
Grp Project                  int64
Subject Mark               float64
Subject Grade               object
Subject Grade Point        float64
Gender                      object
Age                        float64
Institute                   object
Programme                   object
Admission GPA               object
dtype: object


df['Age'] = df['Age'].astype('Int64') # Setting Datatype


df['Admission GPA'] = pd.to_numeric(df['Admission GPA'], errors='coerce').astype('float64') # Setting Datatype


#Admission GPA rounding up gpa to 1 D.P
df['Admission GPA'].round(decimals = 1)

0      3.2
1      2.9
2      2.7
3      3.2
4      2.4
      ... 
123    1.6
124    2.8
125    3.0
126    2.7
127    2.4
Name: Admission GPA, Length: 128, dtype: float64


df.head()


df['Programme'].value_counts() # Counting values in each column

Associate in Information Technology                               40
Associate of Science in Information Systems Development            8
Higher Diploma in Software Engineering                             5
Associate of Engineering                                           4
Higher Diploma in Telecommunications and Networking                4
Associate in Engineering                                           4
Associate in Business                                              3
Higher Diploma in Information Technology                           3
AD in Information Systems Development                              2
Higher Diploma in Cloud and Data Centre Administration             2
Higher Diploma in Computer Studies                                 2
Higher Diploma in Mobile Applications Development                  2
Associate Degree Mobile Information Technology                     2
Associate in Degree Engineering                                    1
Associate Degree in Mobile Information Technology                  1
Associate Degree in Construction Engineering and Management        1
Higher Diploma in Ophthalmic Dispensing                            1
Higher Diploma in Multimedia                                       1
HD in Cloud and Data Centre Administration                         1
Higher Diploma in Computer Network Technology                      1
Associate Degree Information Systems Development                   1
Ascisd                                                             1
Associate of Social Sciences (Sociology Theme)                     1
|Associate of Science in Network and Systems Administration        1
Higher Diploma on Hotel Management                                 1
HD in Electronic and Information Engineering                       1
Advance Certificate in Games Development (Game Design)             1
Associate of Arts in Media, Cultural and Creative Studies          1
Higher Diploma in Dispensing Studies                               1
Higher Diploma in Aviation Studies                                 1
Associate in Statistics and Data Science                           1
Bachelor of Information Technology                                 1
Higher Diploma Programme in Computer Game Development              1
Associate of Science in Network and Systems Administration         1
Hd in Mobile Application Development and Cloud Services            1
Associate of Information Systems Development                       1
Associate in Information System Development                        1
Associate of Science in Business Analysis                          1
HD in Games and Animation                                          1
Associate in Language and Culture                                  1
Associate of Business Administration in General Management         1
|Associate Degree of Science in Information System Development     1
Associate of Science in Surveying                                  1
Bachelor's degree in Social Sciences (Honours) in Psychology       1
Associate of NSA                                                   1
Associate of Science                                               1
HD in Telecommunications and Networking                            1
HD in Information Technology                                       1
HD in Tourism and Events Management                                1
Bachelor of Social Sciences                                        1
HD in Systems Development & Administration                         1
Higher Diploma in Automotive Engineering                           1
HD in Software Engineering                                         1
AD in Netwrok and System Administration                            1
Associate in Applied Social Sciences|                              1
HD in Computer Game Development                                    1
Higher Diploma in Aircraft Maintenance Engineering                 1
AD in Mobile Information Technology                                1
AD in Film, Television and Digital Media Studies                   1
AD in Information Technology                                       1
Name: Programme, dtype: int64


df['Programme'] = df.Programme.str.replace('|','')


df['Programme'] = df.Programme.str.replace('AD','Associate Degree')


df['Programme'] = df.Programme.str.replace('Associate in','Associate Degree in')


df['Programme'] = df.Programme.str.replace('Associate of','Associate Degree in')


df['Programme'] = df.Programme.str.replace('HD','Higher Diploma in')


df['Programme'] = df.Programme.str.replace('Hd','Higher Diploma in')


df['Programme'] = df.Programme.str.replace('in in','in')


df['Programme'] = df.Programme.str.replace('Netwrok','Network')


df = df.sort_values(by=['Programme'])


df['Programme'].value_counts()

Associate Degree in Information Technology                             41
Associate Degree in Science in Information Systems Development          8
Associate Degree in Engineering                                         8
Higher Diploma in Software Engineering                                  6
Higher Diploma in Telecommunications and Networking                     5
Higher Diploma in Information Technology                                4
Associate Degree in Business                                            3
Associate Degree in Information Systems Development                     3
Higher Diploma in Cloud and Data Centre Administration                  3
Associate Degree in Science in Network and Systems Administration       2
Higher Diploma in Mobile Applications Development                       2
Associate Degree Mobile Information Technology                          2
Higher Diploma in Computer Studies                                      2
Associate Degree in Mobile Information Technology                       2
Associate Degree in NSA                                                 1
Higher Diploma on Hotel Management                                      1
Ascisd                                                                  1
Higher Diploma in Multimedia                                            1
Associate Degree in Network and System Administration                   1
Associate Degree in Language and Culture                                1
Higher Diploma in Ophthalmic Dispensing                                 1
Associate Degree in Construction Engineering and Management             1
Bachelor of Information Technology                                      1
Higher Diploma in Dispensing Studies                                    1
Associate Degree in Degree Engineering                                  1
Higher Diploma in Games and Animation                                   1
Higher Diploma in Computer Network Technology                           1
Advance Certificate in Games Development (Game Design)                  1
Associate Degree in Business Administration in General Management       1
Higher Diploma Programme in Computer Game Development                   1
Associate Degree in Applied Social Sciences                             1
Higher Diploma in Tourism and Events Management                         1
Associate Degree in Statistics and Data Science                         1
Associate Degree in Science in Business Analysis                        1
Associate Degree in Film, Television and Digital Media Studies          1
Associate Degree in Information System Development                      1
Higher Diploma in Aviation Studies                                      1
Associate Degree in Science in Surveying                                1
Bachelor of Social Sciences                                             1
Higher Diploma in Mobile Application Development and Cloud Services     1
Associate Degree in Social Sciences (Sociology Theme)                   1
Higher Diploma in Aircraft Maintenance Engineering                      1
Higher Diploma in Automotive Engineering                                1
Bachelor's degree in Social Sciences (Honours) in Psychology            1
Associate Degree of Science in Information System Development           1
Higher Diploma in Computer Game Development                             1
Higher Diploma in Electronic and Information Engineering                1
Associate Degree Information Systems Development                        1
Higher Diploma in Systems Development & Administration                  1
Associate Degree in Science                                             1
Associate Degree in Arts in Media, Cultural and Creative Studies        1
Name: Programme, dtype: int64


df['Degree'] = df.Programme.str.replace(r'(^.*Associate Degree.*$)', 'Associate Degree')


df['Degree'] = df.Degree.str.replace(r'(^.*Higher Diploma.*$)', 'Higher Diploma')


df['Degree'] = df.Degree.str.replace(r'(^.*Bachelor.*$)', 'Others')


df['Degree'] = df.Degree.str.replace(r'(^.*Ascisd.*$)', 'Others')


df['Degree'] = df.Degree.str.replace(r'(^.*Advance Certificate.*$)', 'Others')


df['Degree'].value_counts()

Associate Degree    86
Higher Diploma      37
Others               5
Name: Degree, dtype: int64


df['Category'] = df.Programme.str.replace(r'(^.*Information.*$)', 'IT')


df['Category'] = df.Category.str.replace(r'(^.*Network.*$)', 'Network')
df['Category'] = df.Category.str.replace(r'(^.*Software.*$)', 'IT')
df['Category'] = df.Category.str.replace(r'(^.*Mobile.*$)', 'IT')
df['Category'] = df.Category.str.replace(r'(^.*Cloud and Data.*$)', 'IT')
df['Category'] = df.Category.str.replace(r'(^.*Computer.*$)', 'IT')
df['Category'] = df.Category.str.replace(r'(^.*Systems.*$)', 'IT')
df['Category'] = df.Category.str.replace(r'(^.*Games.*$)', 'Games')
df['Category'] = df.Category.str.replace(r'(^.*Engineering.*$)', 'Engineering')
df['Category'] = df.Category.str.replace(r'(^.*Business.*$)', 'Business')
df['Category'] = df.Category.str.replace(r'(^.*Science.*$)', 'Science')
df['Category'] = df.Category.str.replace(r'(^.*Media.*$)', 'Media ')
df['Category'] = df.Category.str.replace(r'(^.*Multimedia.*$)', 'Media ')
df['Category'] = df.Category.str.replace(r'(^.*Ophthalmic.*$)', 'Others ')
df['Category'] = df.Category.str.replace(r'(^.*Management.*$)', 'Others ')
df['Category'] = df.Category.str.replace(r'(^.*NSA.*$)', 'Others ')
df['Category'] = df.Category.str.replace(r'(^.*Studies.*$)', 'Others ')
df['Category'] = df.Category.str.replace(r'(^.*Culture.*$)', 'Others ')
df['Category'] = df.Category.str.replace(r'(^.*Ascisd.*$)', 'Others ')


df['Category'].value_counts()

IT             82
Engineering    12
Network         9
Others          8
Science         7
Business        5
Media           3
Games           2
Name: Category, dtype: int64


df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128 entries, 49 to 123
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       128 non-null    object 
 1   SGV                      128 non-null    int64  
 2   Assignment               128 non-null    int64  
 3   Test                     128 non-null    float64
 4   Individual Presentation  128 non-null    float64
 5   Grp Project              128 non-null    int64  
 6   Subject Mark             128 non-null    float64
 7   Subject Grade            128 non-null    object 
 8   Subject Grade Point      128 non-null    float64
 9   Gender                   128 non-null    object 
 10  Age                      128 non-null    Int64  
 11  Institute                128 non-null    object 
 12  Programme                128 non-null    object 
 13  Admission GPA            126 non-null    float64
 14  Degree                   128 non-null    object 
 15  Category                 128 non-null    object 
dtypes: Int64(1), float64(5), int64(3), object(7)
memory usage: 17.1+ KB


df['Institute'].value_counts()

PolyU HKCC                                     52
IVE                                            24
CityU Community College                        24
HKU Space                                      11
Hkbu Cie                                        6
Cuhk Scs                                        3
HK College of Technology                        2
Cityu                                           1
University of South Australia                   1
Polyu                                           1
Suzhou Chien-Shiung Institute of Technology     1
Digipen Institute of Tecnology Singapore        1
Polyu Speed                                     1
Name: Institute, dtype: int64


df['Institute'] = df.Institute.str.replace(r'(^.*Polyu Hkcc.*$)', 'PolyU HKCC')
df['Institute'] = df.Institute.str.replace(r'(^.*Polyu HKcc.*$)', 'PolyU HKCC')
df['Institute'] = df.Institute.str.replace(r'(^.*Cityu Community Coll.*$)', 'CityU Community College')
df['Institute'] = df.Institute.str.replace(r'(^.*Hku Space.*$)', 'HKU Space')
df['Institute'] = df.Institute.str.replace(r'(^.*HKBU CIE.*$)', 'Hkbu Cie')
df['Institute'] = df.Institute.str.replace(r'(^.*Hkbu Cie.*$)', 'Hkbu Cie')


df['ID'] = le.fit_transform(df['ID'])
df['SGV'] = le.fit_transform(df['SGV'])
df['Assignment'] = le.fit_transform(df['Assignment'])
df['Test'] = le.fit_transform(df['Test'])
df['Individual Presentation'] = le.fit_transform(df['Individual Presentation'])
df['Grp Project'] = le.fit_transform(df['Grp Project'])
df['Subject Mark'] = le.fit_transform(df['Subject Mark'])
df['Subject Grade'] = le.fit_transform(df['Subject Grade'])
df['Subject Grade Point'] = le.fit_transform(df['Subject Grade Point'])
df['Gender'] = le.fit_transform(df['Gender'])
df['Admission GPA'] = le.fit_transform(df['Admission GPA'])


df.to_csv('final.csv')

	Subject Yr Student ID	Assignment \nTotal: 100	Test Total: 100	Individual Presentation \nTotal: 100	Grp Project\nTotal: 100	Subject Mark	Subject Grade	Subject Grade Point	Gender	Age	Institute	Programme	Admission GPA	Unnamed: 14
0	W19S01	88	73.0	90.0	85	84.5	A	4.0	M	24.0	Polyu Hkcc	Associate in Information Technology	3.19	NaN
1	W19S02	40	65.0	65.0	50	53.0	C+	2.5	M	23.0	Cityu Community Coll	\|Associate Degree of Science in Information Sy...	2.93	NaN
2	W19S03	44	31.0	60.0	73	53.3	C+	2.5	M	27.0	IVE	Higher Diploma in Mobile Applications Development	2.68	NaN
3	W19S04	96	65.0	90.0	94	88.0	A+	4.5	M	24.0	IVE	Higher Diploma in Software Engineering	3.18	NaN
4	W19S05	58	63.0	90.0	94	76.2	A	4.0	M	23.0	Polyu HKcc	AD in Information Technology	2.36	NaN

	ID	Assignment	Test	Individual Presentation	Grp Project	Subject Mark	Subject Grade	Subject Grade Point	Gender	Age	Institute	Programme	Admission GPA
0	W19S01	88	73.0	90.0	85	84.5	A	4.0	M	24	Polyu Hkcc	Associate in Information Technology	3.19
1	W19S02	40	65.0	65.0	50	53.0	C+	2.5	M	23	Cityu Community Coll	\|Associate Degree of Science in Information Sy...	2.93
2	W19S03	44	31.0	60.0	73	53.3	C+	2.5	M	27	IVE	Higher Diploma in Mobile Applications Development	2.68
3	W19S04	96	65.0	90.0	94	88.0	A+	4.5	M	24	IVE	Higher Diploma in Software Engineering	3.18
4	W19S05	58	63.0	90.0	94	76.2	A	4.0	M	23	Polyu HKcc	AD in Information Technology	2.36

	ID	SGV	Assignment	Test	Individual Presentation	Grp Project	Subject Mark	Subject Grade	Subject Grade Point	Gender	Age	Institute	Programme	Admission GPA	Degree	Category
49	W20St09	1	94	100.0	72.9	76	85.6	A	4.0	M	23	Digipen Institute of Tecnology Singapore	Advance Certificate in Games Development (Game...	3.12	Others	Games
19	W19S20	0	48	71.0	30.0	61	52.9	C+	2.5	M	23	Cityu Community Coll	Ascisd	2.42	Others	Others
121	W21S44	1	74	98.0	59.0	70	75.0	B+	3.3	M	20	Cityu Community Coll	Associate Degree Information Systems Development	2.88	Associate Degree	IT
88	W21S09	1	70	98.0	23.0	60	63.0	C+	2.3	M	20	Hkbu Cie	Associate Degree Mobile Information Technology	2.24	Associate Degree	IT
115	W21S38	1	73	100.0	43.0	60	69.0	B-	2.7	M	20	Hkbu Cie	Associate Degree Mobile Information Technology	2.72	Associate Degree	IT

Data Cleaning Project¶

Task: Student Grades¶

Import Necessary Libraries¶

Load Data into Pandas Dataframe¶

Check Missing Values¶

Check for Duplicate Rows¶

Start Cleaning Process¶

Fixing Naming Conventions using string regex¶

Label Encoding the following Columns¶

Saving Final Dataset to CSV File¶